******************************************************************************************
* Do-file name:	cr_clean_data.do
* Task:         clean data, generate new variables, fill-in missing information, merge 
*               imputed education & wages, merge distance
* Last change:  04.03.2022
******************************************************************************************



******************************************************************************************
*** program setup
******************************************************************************************

* Interpret all commands below under Stata 14.2 semantics, whatever release runs the file
version 14.2
* Start from an empty session: clear data, programs, etc. and then drop all macros
clear all
macro drop _all
* Log width, and no --more-- pauses so the file runs unattended
set linesize 90
set more off
* set trace on
* (uncomment the line above to debug; it echoes every executed command)
discard
* Fixed seed so any random-number use during this run is reproducible
set seed 123456789


******************************************************************************************
*** load treatment and control sample
******************************************************************************************

*** read the combined treatment/control sample into memory
use "data/treat_control_sample.dta", clear

*** sanity checks
* (vsnr_ano, year) must uniquely identify rows
isid vsnr_ano year
* no row may have border_imp and control_imp both equal to 1
assert !(border_imp == 1 & control_imp == 1)
* cross-tabulation of the two flags, missing values shown
tabulate border_imp control_imp, missing


******************************************************************************************
*** clean data 
******************************************************************************************

*** person identifier
label variable vsnr_ano "Person ID"

*** ao_gem: negative codes -> missing; the two Berlin codes are collapsed into one
replace ao_gem = .         if ao_gem < 0
replace ao_gem = 11000000  if inlist(ao_gem, 11100000, 11200000)

*** bnr_ano_n: 0 and values >= 99999999 -> missing
replace bnr_ano_n = .  if bnr_ano_n == 0 | bnr_ano_n >= 99999999
label variable bnr_ano_n "firm identification number"

*** alter is renamed to age
rename alter age
label variable age "age (birthm and birthj used for calculation)"

*** dauer
label variable dauer "length of employment (in days)"

*** staat: negative codes and codes above 990 -> missing
*replace staat = .  if inlist(staat, 7, 888, 899, 998, 999)
replace staat = .  if staat < 0 | staat > 990
label variable staat "nationality"

*** beruford: negative codes and the listed codes -> missing
replace beruford = .  if beruford < 0 | inlist(beruford, 0, 310, 677, 999)
label variable beruford "occupation"

*** berufstg: negative codes and codes 5, 6 -> missing
replace berufstg = .  if berufstg < 0 | inlist(berufstg, 5, 6)
label variable berufstg "type of job (0-7 full-time, 8&9 part-time job)"

*** ausbild: negative codes, codes 7-9, and rows of the listed pers_gr groups -> missing
* (pers_gr is read here before its own cleaning below; that cleaning does not touch the listed groups)
replace ausbild = .  if ausbild < 0 | inlist(pers_gr, 205, 301, 302, 303, 304) | inlist(ausbild, 7, 8, 9)
label variable ausbild "education (original data)"

*** pers_gr: negative codes and 999 -> missing
replace pers_gr = .  if pers_gr < 0 | pers_gr == 999
label variable pers_gr "employment relationship (legal classification)"

*** wz73: negative codes and codes 46, 999 -> missing
replace wz73 = .  if wz73 < 0 | inlist(wz73, 46, 999)
label variable wz73 "industry classification 1973"

*** estsize is blanked wherever the (cleaned) firm id is missing
replace estsize = .  if bnr_ano_n == .

*** ten is renamed to tenure
rename ten tenure
label variable tenure "length of uninterrupted employ. at same firm (in years)"

*** wages up to and including 1998 are divided by 1.95583 (DM -> euro)
replace tag_entg = tag_entg / 1.95583  if year <= 1998
label variable tag_entg "average daily wage (in euro)"

*** variables not needed further
drop  wo_gem  wz93  	// wz03, wz08 are only included in later data versions


******************************************************************************************
*** fill in missings for sex and age using past and future values 
******************************************************************************************

*** fill in sex
** forward pass: a missing value takes the person's previous non-missing value
bys vsnr_ano (year): replace sex = sex[_n-1]  if sex == . & sex[_n-1] != .

** backward pass: same rule with years in descending order (fills leading missings)
gsort vsnr_ano -year
by vsnr_ano: replace sex = sex[_n-1]  if sex == . & sex[_n-1] != .
sort vsnr_ano year

*** fill in age
* Age is shifted by the actual number of years between the two rows instead of a fixed 1,
* so the imputed age stays correct when a person's rows are not in consecutive years.
* For consecutive years the year difference is 1 and the result equals the former +1 / -1 rule.
** forward pass: previous age plus the elapsed years
bys vsnr_ano (year): replace age = age[_n-1] + (year - year[_n-1])  if age == . & age[_n-1] != .

** backward pass (descending years): next-later age minus the elapsed years
gsort vsnr_ano -year
by vsnr_ano: replace age = age[_n-1] - (year[_n-1] - year)  if age == . & age[_n-1] != .
sort vsnr_ano year


******************************************************************************************
*** fill in missings for staat and make it time constant (only past values used in my version, qje version uses past and future values)
******************************************************************************************

** nation: copy of staat that is imputed in both time directions
* The person id is written out in full (vsnr_ano) rather than abbreviated to "vsnr", so the
* code does not depend on variable abbreviation being enabled or on "vsnr" staying unambiguous.
gen nation = staat

* replicate a recorded nationality other than 0 forwards, then backwards, within person
* (0 is treated as German further below)
gsort +vsnr_ano +year
replace nation = nation[_n-1] if vsnr_ano==vsnr_ano[_n-1] & nation[_n-1]!=0 & nation[_n-1]<.
gsort +vsnr_ano -year
replace nation = nation[_n-1] if vsnr_ano==vsnr_ano[_n-1] & nation[_n-1]!=0 & nation[_n-1]<.

* fill remaining missings from the neighbouring row of the same person, forwards then backwards
gsort +vsnr_ano +year
replace nation = nation[_n-1] if vsnr_ano==vsnr_ano[_n-1] & nation==. & nation[_n-1]!=.
gsort +vsnr_ano -year
replace nation = nation[_n-1] if vsnr_ano==vsnr_ano[_n-1] & nation==. & nation[_n-1]!=.
label variable nation "nationality (imputed, past and future values used)"


** staat2: every row takes the previous row's value whenever that is non-missing, so the
** first non-missing value of a person is carried through all later years
gen staat2 = staat
bys vsnr_ano (year): replace staat2 = staat2[_n-1]  if staat2[_n-1] != .
label variable staat2 "nationality (imputed, first non-missing value used, time constant)"

** staat: only missings are filled with past values		// not time constant, some individuals switch back and forth
bys vsnr_ano (year): replace staat = staat[_n-1]  if staat == . & staat[_n-1] != .
label variable staat "nationality (missings imputed by past values)"


******************************************************************************************
*** fill in missings for education (only past values used)
******************************************************************************************

** ausbild2: a row takes the previous row's value if that is non-missing and the current
** value is missing or not larger, so the series never decreases within a person
gen ausbild2 = ausbild
bys vsnr_ano (year): replace ausbild2 = ausbild2[_n-1]  if ausbild2[_n-1] != . & (ausbild2 == . | ausbild2 <= ausbild2[_n-1])

* category 2 -> 3 is not treated as a meaningful change: once category 2 has been observed
* for a person, a value of 3 in ausbild2 is recoded to 4
gen help_edu = 2  if ausbild == 2
bys vsnr_ano (year): replace help_edu = help_edu[_n-1]  if help_edu[_n-1] != .
replace ausbild2 = 4  if help_edu == 2 & ausbild2 == 3
drop help_edu
label variable ausbild2 "education (imputed, past educ used if >= current educ, category prob. corrected)"

** ausbild itself: only missings are filled with past values		// education switches up and down
bys vsnr_ano (year): replace ausbild = ausbild[_n-1]  if ausbild == . & ausbild[_n-1] != .
label variable ausbild "education (missings imputed by past values)"


******************************************************************************************
*** merge imputed education to dataset & fill in missings (only past values used)
******************************************************************************************

*** write the working sample to disk so the yearly files can be merged into it one by one
save "data/work.dta", replace

*** one pass per year 1980-2000
forvalues yr = 1980 / 2000 {
	* yearly imputation file: add the year, rename the person id to the panel key,
	* and store it as a temporary merge file
	use "data/educationimputationall`yr'.dta", clear
	gen year = `yr'
	rename vsnr vsnr_ano
	sort vsnr_ano year
	save "data/imp_edu_year_`yr'.dta", replace

	* the first year is merged without options; every later year is merged with -update-
	local mergeopt
	if `yr' > 1980  local mergeopt "update"

	* merge into the working sample, discard using-only rows, store, remove the temporary file
	use "data/work.dta", clear
	merge 1:1 vsnr_ano year using "data/imp_edu_year_`yr'.dta", `mergeopt'
	drop if _merge == 2
	drop _merge
	save "data/work.dta", replace
	erase "data/imp_edu_year_`yr'.dta"
}

** missings in the merged variables are filled with the person's past values		// values of imputed education are already only constant or increasing over time
bys vsnr_ano (year): replace ip1 = ip1[_n-1]  if ip1 == . & ip1[_n-1] != .
label variable ip1 "imputed education (+ missings imputed by past values)"

bys vsnr_ano (year): replace imp_edu = imp_edu[_n-1]  if imp_edu == . & imp_edu[_n-1] != .
label variable imp_edu "imp. education, 3 groups (+ missings imputed by past values)"


******************************************************************************************
*** merge imputed wages to dataset 
******************************************************************************************

* add imputed wages by person-year; rows that exist only in the wage file are not kept
sort vsnr_ano year
merge 1:1 vsnr_ano year using "data/impy1980_2000.dta", keep(master match) nogenerate
label variable impy   "log avg. daily wage (in euro, imputed)"
save "data/work.dta", replace


******************************************************************************************
*** create variables for wage growth regressions 
******************************************************************************************

*** wage growth
** prepare impy for wage growth variable: fill in missings in 1990 for status 2&3 with past values of imputed wage (max 4 years back in past)
* impy_2 starts as a copy of impy; no_lag_impy_2 records how many rows back the value was taken
* from (0 where impy_2 is observed, -1 ... -4 where it is filled in the loop below)
gen impy_2 = impy
gen no_lag_impy_2  = 0  if impy_2 != .

* For x = 1..4: a 1990 row with status 2 or 3 and still-missing impy_2 takes impy from the x-th
* previous row of the same person; the lag counter is then set to -x for that row.
* NOTE(review): [_n-x] is the x-th previous ROW, which is x calendar years back only if the
* person has a row for every year - TODO confirm the panel has no gaps, because wage_year
* below is computed as year + lag.
forvalues x = 1/4 {
bys vsnr_ano (year): replace impy_2 = impy[_n-`x']  if impy_2 == . & impy[_n-`x'] != . & year == 1990 & (status == 2 | status == 3)
bys vsnr_ano (year): replace no_lag_impy_2 = 0-`x'  if no_lag_impy_2 == . & impy_2 != . & impy[_n-`x'] != . & year == 1990
 }

* wage_year = year the filled-in wage is taken to refer to; set only for 1990 rows whose own impy is missing
gen 	wage_year = .
replace wage_year = year + no_lag_impy_2  if year == 1990 & impy == . 
label variable impy_2 "log avg. daily wage (in euro, imputed, missings replaced with max 4 years lagged)"
label variable no_lag_impy_2  "no. of years going back to impute log wage (based on impy_2, max 4 years)"
save "data/work.dta", replace

** calculate average wage growth to adjust past wages to wage level in 1990 
* yearly mean of non-missing impy for 1986-1990, computed from the full imputed-wage file
use "data/impy1980_2000.dta", clear 
keep if year >= 1986 & year <= 1990
keep if impy != .
collapse (mean) impy, by(year)

* spread the 1990 mean to every row (max over a variable that is non-missing only in 1990)
gen  impy_h = impy if year == 1990
egen impy_90 = max(impy_h)
drop impy_h

* wage_growth = 1990 mean minus the mean of the respective year; the file is keyed by
* (year = 1990, wage_year) so it can be merged onto the 1990 rows of the working sample
gen wage_growth = impy_90 - impy
rename year wage_year
gen year = 1990

save "data/wage_growth_86_90.dta", replace

** merge average wage growth to work dataset
use "data/work.dta", clear

* rows with missing wage_year find no match and keep wage_growth missing
merge m:1 year wage_year using "data/wage_growth_86_90.dta", keepusing(year wage_year wage_growth)
drop if _merge == 2
drop _merge

** adjust past wages to wage level in 1990 using avg. growth rate from imputed wages
replace impy_2 = impy_2 + wage_growth  if year == 1990 & wage_growth != .
* helper variables are no longer needed
drop no_lag_impy_2 wage_year wage_growth

** occupation (beruford) and industry classification 1973 (wz73): for 1990 rows with status 2 or 3,
** a missing value is replaced by the value from up to 4 rows earlier of the same person
foreach src in beruford wz73 {
	gen `src'_imp = `src'
	forvalues lag = 1/4 {
		bys vsnr_ano (year): replace `src'_imp = `src'[_n-`lag']  if `src'_imp == . & `src'[_n-`lag'] != . & year == 1990 & (status == 2 | status == 3)
	}
}
label variable beruford_imp "occupation (based on beruford, missings replaced with max 4 years lagged)"
label variable wz73_imp "industry classification 1973 (based on wz73, missings replaced with max 4 years lagged)"

save "data/work.dta", replace


******************************************************************************************
*** generate new variables 
******************************************************************************************

*** flag west-german districts close to the inner German border, distinguish two distances
* 0 = neither list, 1 = close ring, 2 = outer ring, missing if the district code is missing
gen border_imp_ddr = 0

** close distance to border 
replace border_imp_ddr = 1  if inlist(ao_kreis_imp, 1003, 1053, 1055, 1062, 3355) | ///
		                       inlist(ao_kreis_imp, 3360, 3354, 3151, 3103, 3154) | ///
		                       inlist(ao_kreis_imp, 3158, 3153, 3156, 3152, 6636) | ///
						       inlist(ao_kreis_imp, 6632, 6631, 9673, 9674, 9473) | ///
    	                       inlist(ao_kreis_imp, 9469, 9476, 9475, 9464)		

** further away	from border 	
* District 3158 used to appear in this list as well as in the close ring above, so this
* later assignment silently overwrote its close-ring flag. It is now listed in the close ring only.
* NOTE(review): the former outer-ring entry 3158 may have been a typo for another district
* code - confirm against the district map used to build these lists.
replace border_imp_ddr = 2  if inlist(ao_kreis_imp, 1057, 1060, 2000, 3353, 3351) | ///
                               inlist(ao_kreis_imp, 3157, 3101, 3102, 3254) | ///
						       inlist(ao_kreis_imp, 3155, 6633, 6611, 6634, 6535) | ///
						       inlist(ao_kreis_imp, 9672, 9662, 9678, 9471, 9461) | ///
						       inlist(ao_kreis_imp, 9478, 9477, 9462, 9472, 9479, 9377)							

replace border_imp_ddr = .  if ao_kreis_imp == .
label variable border_imp_ddr "west-german districts in BRD/DDR border region: 1 if dist.<40km, 2 if dist.<80km"	

*** female dummy: sex == 2 -> 1, sex == 1 -> 0, anything else -> missing
gen female = cond(sex == 2, 1, cond(sex == 1, 0, .))
label variable female  "1 if female, 0 if male"
drop sex

*** nationality groups based on qje version (variable = nation)
* 1 = germans (code 0), 2 = czechs (codes 155, 162, 164), 3 = all other foreigners
gen nation_gr = cond(nation == 0, 1, cond(inlist(nation, 155, 162, 164), 2, cond(nation != ., 3, .)))
label variable nation_gr "nationality: 1=germans, 2=czechs, 3=all other foreigners (based on nation)"

*** the same grouping based on staat2
gen nation_gr2 = cond(staat2 == 0, 1, cond(inlist(staat2, 155, 162, 164), 2, cond(staat2 != ., 3, .)))
label variable nation_gr2 "nationality: 1=germans, 2=czechs, 3=all other foreigners (based on staat2)"

*** full-time equivalents
* berufstg 0-7 -> 1; 8 -> 0.5 (working hours/week < 18); 9 -> 2/3 (> 18 hours, but not full-time)
gen weight_fte = cond(berufstg >= 0 & berufstg <= 7, 1, cond(berufstg == 8, 0.5, cond(berufstg == 9, 2/3, .)))
label variable weight_fte "full-time equivalent weights: 1 for full-time, 1/2 and 2/3 for part-time"

*** 3 education groups (based on ausbild2)
gen edu_3 = cond(inlist(ausbild2, 0, 1, 3), 1, cond(inlist(ausbild2, 2, 4), 2, cond(inlist(ausbild2, 5, 6), 3, .)))
label define edu_3 1 "[1] None or only a school degree" 2 "[2] School and vocational" 3 "[3] Technical college / university"
label values edu_3 edu_3
label variable edu_3 "education, 3 groups (based on ausbild2)"

*** 2 education groups: edu_2 from ausbild2, imp_edu_2 from imp_edu
gen edu_2 = cond(inlist(ausbild2, 0, 1, 3), 1, cond(inlist(ausbild2, 2, 4, 5, 6), 2, .))
label variable edu_2 "education, 1=low educ, 2=high educ (based on ausbild2)"

* imp_edu with group 3 folded into group 2
gen imp_edu_2 = cond(imp_edu == 3, 2, imp_edu)
label variable imp_edu_2 "education, 1=low educ, 2=high educ (based on imp_edu)"

*** indicator for east germany (berlin classified as west germany)
* district codes above 11000 -> 1, codes up to 11000 -> 0, missing code -> missing
gen ost = (ao_kreis_imp > 11000)  if ao_kreis_imp != .
label variable ost "1 if district belongs to east germany (berlin west ger., based on ao_kreis)"

*** create wage including censoring limits & real wages (non-imputed, based on tag_entg & tag_entg_cens)
* Path uses forward slashes only: a backslash works as a directory separator on Windows alone,
* while forward slashes are accepted by Stata on every platform (as in the other paths of this file).
do "do-files/data_management/cr_real_wages_ado.do"


******************************************************************************************
*** merge industry classification to dataset and create diff. sector classifications
******************************************************************************************

*** generate industry variable (define 28 industries following classification proposed K. Wolf & H. Ludsteck)
	// from 1975-2002 use wz73 (missing for later years)
* The merge key is called w73 here (the using file is merged on that name), so wz73_imp is
* renamed for the merge and renamed back afterwards.
* Path uses forward slashes only (a backslash is a directory separator on Windows alone).
sort wz73_imp year
rename wz73_imp w73
merge m:1 w73  using "data/orig/wz73-wz28.dta"
drop if _merge == 2			// 7 non-missing obs. in master are not matched
rename wz28 industry
label variable industry "28 industries (classification by Wolf & Ludsteck, based on wz73_imp)"
rename w73 wz73_imp
drop _merge

*** create two different sector classifications
** version 1: tag tradables and public sector industries in Wolf/Ludsteck 28-industries classification
* both flags are 0/1 and stay missing where industry is missing
gen ind28_tradable = inrange(industry, 1, 16)  if industry != .
label variable ind28_tradable "1 if tradable industry (based on industry, 28 industries classif.)"
gen ind28_publicsector = inlist(industry, 22, 24, 27, 28)  if industry != .
label variable ind28_publicsector "1 if public sector (based on industry, 28 industries classif.)"

** version 2: tag tradables, (tradable) manufacturing, and non-tradables as in Glitz and Dustmann (2011)
* classification works on the wz73_imp code with its last digit cut off (integer part of code/10);
* groups outside 0-94 and missing codes stay missing
local wzgrp int(wz73_imp/10)
gen wz73_class = .
* 0 = nontradable: groups 1-4, 25, 59-68, 70-78, 83-94
replace wz73_class = 0  if inrange(`wzgrp', 1, 4) | `wzgrp' == 25 | inrange(`wzgrp', 59, 68) | ///
                           inrange(`wzgrp', 70, 78) | inrange(`wzgrp', 83, 94)
* 1 = manufacturing: groups 9-24, 26-58
replace wz73_class = 1  if inrange(`wzgrp', 9, 24) | inrange(`wzgrp', 26, 58)
* 2 = tradable: groups 0, 5-8, 69, 79-82
replace wz73_class = 2  if `wzgrp' == 0 | inrange(`wzgrp', 5, 8) | `wzgrp' == 69 | inrange(`wzgrp', 79, 82)
label var wz73_class "class. of wz73_imp by Glitz/Dustmann, 0=nontradable, 1=manufacturing, 2=tradable"
*label define wz73_class 0 "nontradable" 1 "manufacturing" 2 "tradable"
*label values wz73_class wz73_class


******************************************************************************************
*** merge bbr 2006 information on urban density of each district to dataset
******************************************************************************************

* attach the bbr 2006 district type (ktyp) by district code
* Path uses forward slashes only (a backslash is a directory separator on Windows alone).
sort ao_kreis_imp
merge m:1 ao_kreis_imp using "data/orig/bbr_2006.dta"
* list the district codes of master rows that found no match
tab ao_kreis_imp  if _merge == 1, m		
* rows that exist only in the bbr file are not kept
drop if _merge == 2 		
drop _merge
label variable ktyp "bbr district type (catg.: 1-9)"
*label define ktyp 5 "BBR district type 5: Central Cities in regions with intermediate agglomerations" 6 "BBR district type 6: Urbanised districts in regions with intermediate agglomerations" 7 "BBR district type 7: Rural districts in regions with intermediate agglomerations" 8 "BBR district type 8: Urbanised districts in rural regions" 9 "BBR district type 9: Rural districts in rural regions"
*label values ktyp ktyp


******************************************************************************************
*** merge distance to border to dataset 
******************************************************************************************

* attach the variable distance by municipality code; rows that exist only in the map file are not kept
sort ao_gem_imp
merge m:1 ao_gem_imp using "data/ao_gem_map.dta", keepusing(ao_gem_imp distance) keep(master match) nogenerate		// all municipalities in master are matched with a distance


******************************************************************************************
*** merge matching weights to dataset
******************************************************************************************
// weights are based on the matching procedure in Dustmann, Schönberg, Stuhler (2017), QJE; for details see description there
// the same weights are applied here as treatment and control regions are identical

sort ao_kreis_imp
merge m:1 ao_kreis_imp using "data/orig/controlregions_matching.dta", keepusing(ao_kreis_imp weight_matching) keep(master match)		// all control regions are matched
* unmatched rows with border_imp == 1 get a matching weight of 1
replace weight_matching = 1  if border_imp == 1 & _merge == 1
drop _merge

label var weight_matching "matching weight: 0-1 for contrl. district, 1 for treat. district"


******************************************************************************************
*** merge data on occupational tasks
******************************************************************************************

* merge keys: the full beruford_imp code and the code with its last digit cut off
gen occupthree = beruford_imp
gen occuptwo   = floor(beruford_imp/10)

* attach the task variables at both levels of detail; using-only rows are not kept
sort occupthree
merge m:1 occupthree using "data/bibb_task_3d.dta", keepusing(occupthree bibbtaskthreedigit) keep(master match) nogenerate
merge m:1 occuptwo using "data/bibb_task_2d.dta", keepusing(occuptwo bibbtasktwodigit) keep(master match) nogenerate

* two-group task variables: source codes 1 and 2 -> 1, source code 3 -> 2, anything else -> missing
gen task_2dig_2 = cond(inlist(bibbtasktwodigit, 1, 2), 1, cond(bibbtasktwodigit == 3, 2, .))
label var task_2dig_2 "task, 1=manual/routine, 2=abstract (based on beruford_imp,2digit)" 

gen task_3dig_2 = cond(inlist(bibbtaskthreedigit, 1, 2), 1, cond(bibbtaskthreedigit == 3, 2, .))
label var task_3dig_2 "task, 1=manual/routine, 2=abstract (based on beruford_imp,3digit)" 	


******************************************************************************************
*** save the new dataset
******************************************************************************************

* final layout: one row per person-year in key order, with storage types reduced where possible
sort vsnr_ano year
compress
* dataset label and notes; existing dataset-level notes are removed first
label data "unrestricted estimation sample (1980-2000)"
notes drop _dta
notes: sample between 1980 & 2000
notes: vsnr is classified as employed if employed at 30/06 in respective year
* overwrite the working file with the finished dataset
save "data/work.dta", replace


******************************************************************************************
*** end
******************************************************************************************

* stop executing here; the text after this line is not run
exit


*========================================================================================*
Comments:
- unique identifier: vsnr_ano year
